How complete are the CDC's COVID-19 case surveillance and NCHS datasets for deaths by race/ethnicity at the state and county levels?

March 8, 2021

In [ ]:
#@title
import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
#%load_ext google.colab.data_table
In [2]:
#@title
# We are no longer updating the data because the CRDT has stopped data collection, so there's no
# good comparison for the race/ethnicity data. However, if you want to update the CDC data on 
# its own or only compare to the NYT data that doesn't have race/ethnicity, see instructions below.
# How to update the data:
# 0. You may need to copy this colab so you have your own version.
# 1. Update the cdc_table to have the latest data's suffix.
# 2. Update the date variables below to be the last case date included in the data.
#    If the CRDT doesn't have data on that exact date, choose the closest date for crdt_date.
# 3. If the last case date is after Feb 2, 2021, you'll need to upload a new version
#    of the crdt data to compare against and change the crdt_table name below.
# 4. Scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
#    Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
#    the paper including text and tables, a lot of that is done in commented out PrintSummaryStats().

project_id = 'msm-secure-data-1b'
cdc_table = '`%s.ndunlap_secure.cdc_restricted_access_20210228`' % project_id
crdt_table = '`%s.ndunlap_secure.crdt_20210305`' % project_id
date = 'DATE(2021, 02, 13)'
crdt_date = '20210214'
date_display_name = 'Feb 13'

# Provisional deaths data dates.
provisional_date = 'DATE(2021, 3, 3)'
crdt_provisional_date = 20210303
provisional_date_display_name = 'Mar 3'
provisional_deaths_state_table = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210303`'
provisional_deaths_county_table = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_county_20210303`'

# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
total_cases_scale_max = 55000
county_cases_scale_max = 22000
county_cases_zoom_scale_max = 3000
cases_known_scale_max = 50000 # known race/ethnicity

# Chart settings.
scatter_height = 300
scatter_width = 300
map_height = 300
map_width = 450
us_states = alt.topo_feature(data.us_10m.url, 'states')
us_counties = alt.topo_feature(data.us_10m.url+"#", 'counties')

territories = ('PR', 'GU', 'VI', 'MP', 'AS')
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
states_to_fips = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NYC': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78}
states_to_abbreviations = {'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}
race_ethnicity_combined_map = {
    'Asian, Non-Hispanic': 'asian_cases',
    'Black, Non-Hispanic': 'black_cases',
    'White, Non-Hispanic': 'white_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
    'Hispanic/Latino': 'hispanic_cases',
    'Multiple/Other, Non-Hispanic': 'other_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
    'Missing': 'unknown_cases',
    'Unknown': 'unknown_cases',
    'NA': 'na_cases',
}
In [3]:
#@title
crdt_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
  date = %s
''' % (crdt_table, crdt_date))

nyt_states_query = ('''
SELECT
  state_name,
  state_fips_code,
  deaths as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''' % date)

nyt_counties_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s AND
  county_fips_code IS NOT NULL
''' % date)

cdc_states_query = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state
''' % cdc_table)

cdc_counties_query = ('''
SELECT
  res_state,
  res_county,
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % cdc_table)

compare_cases_unknowns_query = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state,
   race_ethnicity_combined
''' % cdc_table)

cdc_states_by_month_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as total_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2021, 1, 1) AND
  res_state in ('AK', 'CA', 'CT', 'DE', 'GA', 'LA', 'MD', 'ND', 'NY', 'PA', 'RI')
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_states_by_month_known_or_na_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as known_or_na_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2021, 1, 1) AND
  race_ethnicity_combined != 'Unknown' AND
  race_ethnicity_combined != 'Missing'
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_overall_query = ('''
SELECT
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   1
''' % cdc_table)

cdc_provisional_deaths_state_overall_query = ('''
SELECT
  *
FROM
  %s
WHERE
  Indicator = "Distribution of COVID-19 deaths (%%)" AND
  State = 'United States'
''' % provisional_deaths_state_table)

cdc_provisional_deaths_state_counts_overall_query = ('''
SELECT
  *
FROM
  %s
WHERE
  Indicator = "Count of COVID-19 deaths" AND
  State = 'United States'
''' % provisional_deaths_state_table)

county_fips_mapping_query = ('''
SELECT
*
FROM
  `msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')

acs_population_data_query = ('''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''')
In [4]:
#@title
# Provisional deaths data queries.
cdc_provisional_deaths_county_query = ('''
SELECT
  *
FROM %s
WHERE
  Indicator = "Distribution of COVID-19 deaths (%%)"
''' % provisional_deaths_county_table)

cdc_provisional_deaths_state_query = ('''
SELECT
  *
FROM %s
WHERE
  Indicator = "Distribution of COVID-19 deaths (%%)" AND
  State != 'United States'
''' % provisional_deaths_state_table)

cdc_provisional_deaths_state_counts_query = ('''
SELECT
  *
FROM %s
WHERE
  Indicator = "Count of COVID-19 deaths" AND
  State != 'United States'
''' % provisional_deaths_state_table)

crdt_provisional_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
  date = %s
''' % (crdt_table, crdt_provisional_date))

nyt_counties_provisional_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s  AND
  county_fips_code IS NOT NULL
''' % provisional_date)
In [5]:
#@title
def FieldAnalysis(project_id, table, field_list):
  dict = {}
  for field in field_list:
      dict[field] = [0.0, 0.0, 0.0, 0.0]
  unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
  field_series = []
  value_series = []
  percent_series = []

  for field in field_list:
    field_unknowns_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    WHERE
      death_yn = 'Yes'
    GROUP BY
      %s
    ''')
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df.set_index(field, inplace=True)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    field_display_name = {
        'cdc_case_earliest_dt': 'CDC earliest case date',
        'current_status': 'Case status',
        'res_state': 'State',
        'res_county': 'County',
        'sex': 'Sex',
        'age_group': 'Age',
        'race_ethnicity_combined': 'Race/Ethnicity'}

    missing_count = 0
    if 'Missing' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Missing'].cases
    if 'Null' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Null'].cases
    if '' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc[''].cases
    if 'OTH' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['OTH'].cases
    if 'nul' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['nul'].cases
    #if field_unknowns_df.index.isnull().any():
    #  missing_count += field_unknowns_df.loc[field_unknowns_df.index.isnull()].cases
    unknowns.loc['Missing', field] = missing_count / field_unknowns_df.cases.sum()

    if 'Unknown' in field_unknowns_df.index:
      unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases / field_unknowns_df.cases.sum()
    if 'NA' in field_unknowns_df.index:
      unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases / field_unknowns_df.cases.sum()
    unknowns.loc['Known', field] = 1 - (unknowns.loc['Missing', field] +
                                        unknowns.loc['Unknown', field] +
                                        unknowns.loc['NA', field])
    field_series.extend([field_display_name.get(field, field)] * 4)
    value_series.extend(['Known', 'Supressed', 'Unknown', 'Missing'])
    percent_series.extend([unknowns.loc['Known', field],
                           unknowns.loc['NA', field],
                           unknowns.loc['Unknown', field],
                           unknowns.loc['Missing', field]])
  test = pd.DataFrame.from_dict({'field': field_series,
                               'value': value_series,
                               'percent': percent_series})
  return alt.Chart(test).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field', sort='x', title='Field'),
      color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
      ]
  )

Abstract

The Covid Tracking Project was the most reliable source for COVID-19 data by race/ethnicity at the state level until it stopped collecting data on March 7, 2021. The CDC's Case Surveillance Restricted Access dataset and the CDC's National Center for Health Statistics provisional deaths dataset could potentially replace the Covid Tracking Project's dataset and also enable analyses of age and race/ethnicity along with county-level data. This paper evaluates the usability of these datasets at the state and county levels in terms of (1) the total number of deaths included compared to the Covid Tracking Project and the New York Times, and (2) the number of deaths included with race/ethnicity data. The CDC's Case Surveillance Restricted Access dataset's completeness is highly variable across states and counties with 10 states reporting fewer than 10% of deaths and five reporting 0% of the deaths in the Covid Tracking Project. Thankfully, the National Center for Health Statistics' dataset is highly complete and a suitable replacement for the Covid Tracking Project in all states except for North Carolina. At the county level, the tradeoffs between the two datasets get more complicated; the National Center for Health Statistics is more complete in the counties it contains, but it is limited to counties that have had at least 100 COVID-19 deaths, which are generally counties with larger populations.

Background

This analysis picks up where the case data completeness analysis left off to evaluate COVID-19 deaths data with race/ethnicity at the state and county levels.

While we only have two options for COVID-19 case data with race/ethnicity, we have four options for COVID-19 deaths data with race/ethnicity:

  1. Covid Racial Data Tracker: States updated twice a week until March 7, 2021
  2. CDC Case Surveillance Restricted Access data: States and counties updated monthly
  3. CDC's National Center for Health Statistics (NCHS) provisional deaths data from death certificates: States and counties updated weekly
  4. American Public Media Research Lab: States updated monthly

In the case data completeness analysis, we compared the CDC Case Surveillance dataset to the Covid Racial Data Tracker (CRDT). In this analysis, we'll compare the CDC Case Surveillance dataset to the CRDT and the NCHS datasets. We will not analyze the American Public Media Research Lab dataset because they use a mixture of data from state public health websites for most states, as the CRDT does, and NCHS data in 14 states where the data are unavailable or less complete than on public health websites.

There are completeness issues with using the CDC case dataset for deaths:

  • Only 78% of total deaths in the Covid Tracking Project up to February 13 are included (102K out of 474K cases are missing)
  • Of those deaths, 75% have known race/ethnicity (93K out of 372K deaths are missing race/ethnicity)

For the 279K deaths where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:

In [6]:
#@title
overall_df = pd.io.gbq.read_gbq(cdc_overall_query, project_id=project_id)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
overall_df = overall_df.set_index('race_ethnicity_combined')

chart_denominator = 1000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
         overall_df.cases['black_cases'] / chart_denominator,
         overall_df.cases['white_cases'] / chart_denominator,
         overall_df.cases['asian_cases'] / chart_denominator,
         overall_df.cases['nhpi_cases'] / chart_denominator,
         overall_df.cases['aian_cases'] / chart_denominator,
         overall_df.cases.sum() / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})

bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.2%'), scale=alt.Scale(domain=(0, .0026)), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
                  alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who died from COVID-19 based on incomplete CDC Case Data as of %s' % date_display_name)

bars.display()

#alt.concat(bars).properties(
#    title=alt.TitleParams(
#        ['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
#        baseline='bottom',
#        dy=20,
#        orient='bottom',
#        fontWeight='normal',
#        fontSize=11
#    )
#).display()

But the chart above is based on incomplete data. For example, the CDC dataset says that 0% of deaths in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 46% of deaths (21K people) as of February 17.

If all 93K deaths with missing race/ethnicity were Hispanic/Latino people, the percent of Hispanic/Latinos in the U.S. who died from COVID-19 would go from 0.05% to 0.21% — a 4x increase. If all 93K deaths with missing race/ethnicity were Black people, the percent of Black people who died from COVID-19 would go from 0.10% to 0.33% — a 3x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.

We can get a more complete view of the deaths within each race/ethnicity group from the NCHS state dataset, which contains 97% of the deaths in the Covid Tracking Project up to March 3, where 99% of those deaths have race/ethnicity information.

In [7]:
#@title

overall_provisional_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_overall_query, project_id=project_id)
overall_provisional_df
overall_provisional_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_overall_query, project_id=project_id)
overall_provisional_counts_df.head(15)

overall_provisional_df['cdc_known_cases_percent'] = round((
    overall_provisional_df.Non_Hispanic_White.fillna(0) +
    overall_provisional_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    overall_provisional_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    overall_provisional_df.Non_Hispanic_Asian.fillna(0) +
    overall_provisional_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    overall_provisional_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
overall_provisional_counts_df['cdc_known_cases'] = round(
    overall_provisional_counts_df.Non_Hispanic_White.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_Asian.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    overall_provisional_counts_df.Hispanic_or_Latino.fillna(0))
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
total_cases_including_unknowns = round(
    overall_provisional_counts_df.cdc_known_cases[0] / overall_provisional_df.cdc_known_cases_percent[0], 0)

chart_denominator = 1000
cases_list = [overall_provisional_counts_df.Hispanic_or_Latino[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_Black_or_African_American[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_White[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_Asian[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native[0] / chart_denominator,
              total_cases_including_unknowns / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})

bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.2%'), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
                  alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who died from COVID-19 based on more complete NCHS Data as of %s' % provisional_date_display_name
)

bars.display()

We can see that all of the percentages are larger due to having more complete case counts, more cases with known race/ethnicity, and two more weeks of data than in the CDC case data. In these results, 0.150% of Hispanics/Latinos have died from COVID-19, while 0.177% of Black people have died from COVID-19. That translates to 150 deaths out of every 100K Hispanic/Latino people and 177 deaths out of every 100K Black people. The group with the highest rate of deaths is American Indian/Alaska Native with 255 deaths per 100K people. These results are more similar to the Covid Racial Data Tracker and American Public Media Research Lab than the CDC case data.

Overview

The goal of this analysis is to assess the completeness of the CDC's Case Surveillance Restricted Access dataset and the NCHS datasets to evaluate their feasibility in examining disparities in race/ethnicity for COVID-19 deaths at the state and county levels.

The top-level data completeness findings for the CDC case dataset are:

  1. Data Overview: The field indicating whether the person died is only known for 53% of cases. For cases where the person died, race/ethnicity was known for 75% of deaths, as opposed to 98%-100% for all the other fields below.
In [8]:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
FieldAnalysis(project_id, cdc_table, field_list).display()
  1. Total Death Counts: The CDC dataset contains 78% of the total deaths reported in the Covid Tracking Project (CTP). There's high variability at the state level with 10 states reporting fewer than 10% of deaths and five reporting 0%.
  2. Deaths with Race/Ethnicity: Race/ethnicity data is available for 75% of deaths in the CDC dataset compared to 93% in the Covid Racial Data Tracker. The Covid Racial Tracker has less variability with all but one state reporting at least 70% of deaths with race/ethnicity.

We will also evaluate a better source for deaths data that comes from death certificates, as reported in the NCHS state dataset, the NCHS county dataset, and on this CDC dashboard. The NCHS state dataset contains 97% of the total deaths in the Covid Tracking Project and 99% of the deaths have race/ethnicity.

We used a composite measurement to evaluate the CDC case dataset and NCHS datasets' completeness as compared to the Covid Racial Data Tracker (CRDT) at the state level. We calculated the percent of total deaths that have race/ethnicity data and broke it down into its two separate components: the percentage of expected death counts included and the percentage of deaths included with race/ethnicity. We looked at the number of states that had at least 50% of total death counts with race/ethnicity and those that had at least 85%.

In [9]:
#@title
# Manually update these fields based on the latest CDC data.
# SELECT
# count(*) as count
# FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20210131`
# WHERE death_yn = 'Yes'
# AND race_ethnicity_combined NOT IN ('Unknown', 'NA', 'Missing')
# https://covidtracking.com/data/national
# County data calculated in Counties: CDC vs. NYT section.
row_names = [
    '% of CTP deaths count',
    '% with race/ethnicity',
    'Composite % of total with race/ethnicity',
    'Number of states with composite > 85%',
    '(as a percent of all states)',
    'Number of states with composite > 50%',
    '(as a percent of all states)',
]
crdt_metadata = [
    '100%', 
    '93%',
    '93%',
    '45',
    '(88%)',
    '50',
    '(98%)',
]
cdc_metadata = [
    '78%', 
    '75%',
    '58%',
    '8',
    '(17%)',
    '32',
    '(67%)',
]
cdc_provisional_metadata = [
    '97%', 
    '99%',
    '96%',
    '44',
    '(86%)',
    '50',
    '(98%)',
]
table_data = {'CRDT': crdt_metadata, 'CDC': cdc_metadata, 'NCHS': cdc_provisional_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[9]:
CRDT CDC NCHS
% of CTP deaths count 100% 78% 97%
% with race/ethnicity 93% 75% 99%
Composite % of total with race/ethnicity 93% 58% 96%
Number of states with composite > 85% 45 8 44
(as a percent of all states) (88%) (17%) (86%)
Number of states with composite > 50% 50 32 50
(as a percent of all states) (98%) (67%) (98%)

The NCHS dataset offers an excellent alternative to the case dataset at the state level. At the county level, the NCHS dataset only includes counties with 100 or more deaths, so it contains fewer counties than the case data, but those counties account for a larger percentage of the U.S. population. We will discuss the tradeoffs for county-level datasets in more detail later on in this analysis.

What we didn't include in this report:

Completeness Analysis

Total Death Counts

Baseline: NYT vs. CRDT

To get a baseline of how much we could expect the CDC death counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT death counts are equal.

In [10]:
#@title
def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  if metric_type == 'ratio':
    plot.interactive()

  line = pd.DataFrame({
      'x': [0, scale_max],
      'y': [0, scale_max],
  })

  if metric_type == 'ratio':
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  elif metric_type == 'percent':
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = us_states
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = us_counties

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  states_outline = alt.Chart(us_states).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  states_fill = alt.Chart(us_states).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  scatter = CreateScatterPlot(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
  map = CreateMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
  return (scatter | map).configure_view(
       strokeWidth=0,
   ).configure_mark(
       stroke='grey'
   ).configure_legend(
       gradientLength=scatter_height - 50
   )

def PrintSummaryStats(chart_df, field='percent'):
  plus_minus_15_df = chart_df[chart_df[field] >= .85]
  plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
  print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
  plus_minus_50_df = chart_df[chart_df[field] >= .50]
  plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
  print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
  print('< than .50: ', len(chart_df[chart_df[field] < .5]))
  print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
  print(chart_df[field].describe())
In [11]:
#@title
crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)
crdt_df.set_index('state', inplace=True)

nyt_states_df = pd.io.gbq.read_gbq(nyt_states_query, project_id=project_id)
nyt_states_df.state_fips_code.unique()
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
for territory in nyt_territories:
  nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)

crdt_df.reset_index(inplace=True)
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_df.set_index('state_fips_code', inplace=True)
nyt_crdt_merged_df = nyt_states_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')

nyt_crdt_merged_df['percent'] = round(nyt_crdt_merged_df.nyt_cases / nyt_crdt_merged_df.crdt_cases, 2)
nyt_crdt_merged_df
nyt_crdt_merged_df.reset_index(inplace=True)


below_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent < .85]) / len(nyt_crdt_merged_df)
above_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent > 1.15]) / len(nyt_crdt_merged_df)
#print('between +/-15%: ', round(1 - below_15 - above_15, 2))
#nyt_crdt_merged_df.percent.describe()
In [12]:
#@title
nyt_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NYT to CRDT'},
}
nyt_crdt_title = 'Ratio of NYT to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    nyt_crdt_merged_df, nyt_crdt_fields_dict, nyt_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

States: CDC vs. CRDT

We can see below that the CDC death counts differ from the CRDT death counts much more drastically than the NYT did.

In [13]:
#@title
cdc_states_df = pd.io.gbq.read_gbq(cdc_states_query, project_id=project_id)
cdc_states_df.rename(columns={'res_state': 'state'}, inplace=True)
cdc_states_df.set_index('state', inplace=True)

crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)

for territory in territories:
  crdt_df = crdt_df[crdt_df.state != territory]

crdt_df.set_index('state', inplace=True)
cdc_crdt_merged_df = cdc_states_df.join(crdt_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_crdt_merged_df.reset_index(inplace=True)
cdc_crdt_merged_df['state_fips_code'] = cdc_crdt_merged_df.state
cdc_crdt_merged_df = cdc_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_crdt_merged_df['percent'] = round(cdc_crdt_merged_df.cdc_cases / cdc_crdt_merged_df.crdt_cases, 4)

# PrintSummaryStats(cdc_crdt_merged_df)
In [14]:
#@title
# CDC vs. NYT county

df = pd.io.gbq.read_gbq(cdc_counties_query, project_id=project_id)
for territory in territories:
  df = df[df.res_state != territory]

df_county_fips_map = pd.io.gbq.read_gbq(county_fips_mapping_query, project_id=project_id)

df_county_fips_map.cdc_county = df_county_fips_map.cdc_county.str.lower()
df_county_fips_map['state_county'] = df_county_fips_map.state + '-' + df_county_fips_map.cdc_county
df_county_fips_map['state_county'] = df_county_fips_map.state_county.astype('string').str.strip()
df_county_fips_map.set_index('state_county', inplace=True)
In [15]:
#@title
# Concatenate the state and county names because county names are not unique across states.
df.res_county = df.res_county.str.lower()
df['state_county'] = df.res_state + '-' + df.res_county
df['state_county'] = df.state_county.astype('string').str.strip()
df.set_index('state_county', inplace=True)
df['race_ethnicity_combined'] = df.race_ethnicity_combined.astype('string').str.strip()

df = df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
In [16]:
#@title
mismatches_df = df.join(df_county_fips_map, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
#print(mismatches_df.cases.sum())
#print('vs. 60363 with NULL county_fips_code')
# SELECT 
#count(*) as total_cases,
#FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`
#WHERE county_fips_code IS NULL
In [17]:
#@title
merged_df = df.join(df_county_fips_map, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')

# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
crosstab_df = pd.crosstab(merged_df['county_fips'], merged_df.race_ethnicity_combined, values=merged_df.cases, aggfunc=sum,
                          margins=True,
                          margins_name='total_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['county_fips'] = crosstab_df.county_fips.astype(int)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.na_cases.fillna(0) - crosstab_df.unknown_cases.fillna(0)
In [18]:
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
df_acs_name_lookup = pd.io.gbq.read_gbq(acs_population_data_query, project_id=project_id)

df_acs_name_lookup['state_county'] = df_acs_name_lookup.county.astype('string').str.strip() + ', ' + df_acs_name_lookup.state.astype('string').str.strip()
df_acs_name_lookup.drop(columns=['state', 'county'], inplace=True)
df_acs_name_lookup.set_index('county_fips', inplace=True)

county_chart_df = crosstab_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_df.county_fips = county_chart_df.county_fips.astype(int)

#print(len(county_chart_df))
#print(county_chart_df.total_pop.sum())
#print(county_chart_df.total_pop.sum() / 324697795)  # Population covered in these counties
In [19]:
#@title

nyt_counties_df = pd.io.gbq.read_gbq(nyt_counties_query, project_id=project_id)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)

county_chart_df.set_index('county_fips', inplace=True)
nyt_merged_df = county_chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_merged_df = nyt_merged_df.reset_index()
nyt_merged_df['percent'] = round(nyt_merged_df.total_cases / nyt_merged_df.nyt_cases, 2)

#PrintSummaryStats(nyt_merged_df)
In [20]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

Counties: CDC vs. NYT

We can do the same analysis at the county level using the CDC vs. NYT data.

In [21]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC to NYT Deaths by County as of %s' % date_display_name
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'

scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, county_cases_scale_max, scatter_height, scatter_width, 'county', 'ratio'
)
zoom_scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, zoom_cdc_nyt_title, county_cases_zoom_scale_max, scatter_height, scatter_width, 'county', 'ratio'
)

(scatter | zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [22]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC to NYT Deaths by County as of %s' % date_display_name

cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height, map_width, 'county', 'ratio'
)
cdc_crdt_map = CreateMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, map_height, map_width, 'state', 'ratio'
)

(cdc_crdt_map | cdc_nyt_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

Notes:

  • The legend only goes to 2.0, and all counties with a larger ratio are shown in the same dark blue color.
  • States can have deaths but no counties with deaths because county names can be suppressed for privacy reasons.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

Deaths with Race/Ethnicity

In [23]:
#@title
states_df = pd.io.gbq.read_gbq(compare_cases_unknowns_query, project_id=project_id)
for state in ('Unknown', 'NA', 'OCONUS'):
  states_df = states_df[states_df.res_state != state]

states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': {
    'Asian, Non-Hispanic': 'cdc_known_cases',
    'Black, Non-Hispanic': 'cdc_known_cases',
    'White, Non-Hispanic': 'cdc_known_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
    'Hispanic/Latino': 'cdc_known_cases',
    'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
    'Missing': 'cdc_unknown_cases',
    'Unknown': 'cdc_unknown_cases',
    'NA': 'cdc_na_cases',
    }})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
In [24]:
#@title
crosstab_df = pd.crosstab(states_df['state'], states_df.race_ethnicity_combined, values=states_df.cdc_cases, aggfunc=sum,
                          margins=True,
                          margins_name='cdc_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['cdc_known_or_na_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df['cdc_known_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df

crdt_merged_df = crosstab_df.join(crdt_df, on="state", how='inner', lsuffix='_left', rsuffix='_right')
crdt_merged_df.reset_index(inplace=True)
crdt_merged_df['state_fips_code'] = crdt_merged_df.state
crdt_merged_df = crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_merged_df['cdc_known_cases_percent'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['cdc_known_or_na_cases_percent'] = round(crdt_merged_df.cdc_known_or_na_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['percent'] = round(crdt_merged_df.cdc_cases / crdt_merged_df.crdt_cases, 4)
crdt_merged_df['percent_known_cases'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.crdt_known_race_cases, 4)

crdt_merged_df_no_ny = crdt_merged_df[crdt_merged_df.state != 'NY']
#PrintSummaryStats(crdt_merged_df_no_ny)

States: CDC vs. CRDT

How does the CDC dataset compare to the CRDT dataset, which is the most up-to-date aggregate data we have for race/ethnicity at the state level? Overall, 93% of the deaths in the CRDT dataset have known race/ethnicity compared to 75% in the CDC dataset (77% with suppressed).

We may even be undercounting the 93% of deaths with known race/ethnicity in the CRDT data because of the non-standard ways that each state reports on race/ethnicity, as described in this Covid Racial Data Tracker analysis. If a state uses a combined race/ethnicity field, then it's a straightforward comparison to the CDC's combined race/ethnicity field. If a state uses separate fields for race/ethnicity, then we still use the number of people with known race within each state because all of the race categories will also contain Hispanic/Latino people. We could potentially be undercounting the number of people with known race/ethnicity in the CRDT if there are people who have unknown race but known ethnicity. If we adjusted the numbers in those cases, it would make the CRDT percentages look even better in comparison to the CDC data.

In [25]:
#@title
cdc_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_known_state_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_state_map = CreateMap(
    crdt_merged_df, cdc_known_state_fields_dict, cdc_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

crdt_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name
crdt_known_map = CreateMap(
    cdc_crdt_merged_df, crdt_known_state_fields_dict, crdt_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_known_map | cdc_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [26]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC to CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name

CreateScatterPlotAndMap(
    crdt_merged_df, fields_dict, title, cases_known_scale_max, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [27]:
#@title
#print('CRDT known % ', crdt_merged_df.crdt_known_race_cases.sum() / crdt_merged_df.crdt_cases.sum())
#PrintSummaryStats(cdc_crdt_merged_df, field='crdt_known_race_cases_percent')

Counties: CDC

We can also look at the percentage of cases with known race/ethnicity at the county level alongside the state-level numbers.

In [28]:
#@title

chart_df = county_chart_df.copy(deep=True)
chart_df.reset_index(inplace=True)
chart_df.county_fips = chart_df.county_fips.astype(int)
chart_df['percent_known_cases'] = round(chart_df.total_known_cases / chart_df.total_cases, 2)
chart_df['total_known_or_na_cases'] = chart_df.total_known_cases.fillna(0) + chart_df.na_cases.fillna(0)
chart_df['percent_known_or_na_cases'] = round(chart_df.total_known_or_na_cases / chart_df.total_cases, 2)
In [29]:
#@title
cdc_known_county_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_state_map | cdc_known_county_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

Later on, we'll look at the effect of data suppression on these numbers at the county level, especially counties with small populations.

In [30]:
#@title
#PrintSummaryStats(crdt_merged_df, field='cdc_known_cases_percent')
#PrintSummaryStats(crdt_merged_df, field='cdc_known_or_na_cases_percent')
#tuple(crdt_merged_df[crdt_merged_df.cdc_known_or_na_cases_percent <= .5].state)

Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.

Better Data from the NCHS

The CDC has an alternative public source for death data with race/ethnicity at the state and county levels that comes from death certificates via the NCHS. This dataset is more complete than the case surveillance dataset in every way except that the county-level dataset only contains about one fourth of the counties in the CDC case data. The reason for this is that the NCHS county dataset only includes counties with 100 or more deaths. The NCHS state dataset also has a limit of 100 or more deaths per state, but all states had reached that threshold by Mar 3, 2021.

The table below summarizes the completeness of the CDC Case dataset vs. the NCHS datasets for counties and states.

In [31]:
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
    'Update frequency',
    'Latest deaths date',
    'Deaths in dataset as of date',
    'Deaths in CTP as of date',
    '(as a % of CTP)',
    'Number of counties',
    '(as a % of all counties)',
    'Population in those counties',
    '(as a % of total U.S population – States + D.C.)',
    'Deaths with known race/ethnicity',
    '(as a % of deaths in dataset)',
    'Access',
    'Limitations'
]
cdc_deaths_county_metadata = [
    'Weekly',
    'Mar 3, 2021',
    '439K',
    '509K',
    '(86%)',
    '712',
    '(23%)',
    '260K',
    '(80%)',
    '432K',
    '(98%)',
    'Public',
    'Counties >= 100 deaths'
]
cdc_deaths_state_metadata = [
    'Weekly',
    'Mar 3, 2021',
    '494K',
    '509K',
    '(97%)',
    '-',
    '-',
    '-',
    '-',
    '491K',
    '(99%)',
    'Public',
    '-'
]
cdc_cases_metadata = [
    'Monthly', 
    'Feb 13, 2021',
    '372K',
    '474K',
    '(78%)',
    '2,391',
    '(76%)',
    '280M',
    '(86%)',
    '279K',
    '(75%)',
    'Restricted',
    'Data completeness issues'
]
table_data = {'CDC': cdc_cases_metadata,
              'NCHS state': cdc_deaths_state_metadata,
              'NCHS county': cdc_deaths_county_metadata,
}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[31]:
CDC NCHS state NCHS county
Update frequency Monthly Weekly Weekly
Latest deaths date Feb 13, 2021 Mar 3, 2021 Mar 3, 2021
Deaths in dataset as of date 372K 494K 439K
Deaths in CTP as of date 474K 509K 509K
(as a % of CTP) (78%) (97%) (86%)
Number of counties 2,391 - 712
(as a % of all counties) (76%) - (23%)
Population in those counties 280M - 260K
(as a % of total U.S population – States + D.C.) (86%) - (80%)
Deaths with known race/ethnicity 279K 491K 432K
(as a % of deaths in dataset) (75%) (99%) (98%)
Access Restricted Public Public
Limitations Data completeness issues - Counties >= 100 deaths

The NCHS datasets are more complete at the U.S. level on all measures except for the number of counties contained in the county-level data. The American Public Media Research Lab found that the NCHS state dataset was more complete than public health website data for 15 states.

Total Death Counts

In [32]:
#@title

cdc_provisional_deaths_state_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_query, project_id=project_id)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.replace(
    to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_query, project_id=project_id)
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.replace(
    to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_df['cdc_known_cases_percent'] = round((
    cdc_provisional_deaths_state_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_state_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
cdc_provisional_deaths_state_counts_df['cdc_known_cases'] = round(
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Hispanic_or_Latino.fillna(0))
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.set_index('State')
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.set_index('State')
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.join(
    cdc_provisional_deaths_state_counts_df, on="State", how='inner', lsuffix='_left', rsuffix='_right')
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
cdc_provisional_deaths_state_df['cdc_cases'] = round(
    cdc_provisional_deaths_state_df.cdc_known_cases / cdc_provisional_deaths_state_df.cdc_known_cases_percent, 0)
# Comnbine the case counts for NY and New York City, recalculate percentage, remove New York City. 
cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] = (
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] +
    cdc_provisional_deaths_state_df.loc['New York City', 'cdc_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] = (
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] +
    cdc_provisional_deaths_state_df.loc['New York City', 'cdc_known_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases_percent'] = round(
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] /
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'], 4)
cdc_provisional_deaths_state_df.drop(['New York City'], inplace=True)
cdc_provisional_deaths_state_df.reset_index(inplace=True)
cdc_provisional_deaths_state_df.rename(columns={'State': 'state'}, inplace=True)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df[
    ['state', 'cdc_cases', 'cdc_known_cases', 'cdc_known_cases_percent']].copy()

crdt_provisional_df = pd.io.gbq.read_gbq(crdt_provisional_query, project_id=project_id)
crdt_provisional_df.set_index('state', inplace=True)
cdc_provisional_crdt_merged_df = cdc_provisional_deaths_state_df.join(crdt_provisional_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_provisional_crdt_merged_df.reset_index(inplace=True)
cdc_provisional_crdt_merged_df['state_fips_code'] = cdc_provisional_crdt_merged_df.state
cdc_provisional_crdt_merged_df = cdc_provisional_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_provisional_crdt_merged_df['percent'] = round(cdc_provisional_crdt_merged_df.cdc_cases / cdc_provisional_crdt_merged_df.crdt_cases, 4)
cdc_provisional_crdt_merged_df['percent_known_cases'] = round(cdc_provisional_crdt_merged_df.cdc_known_cases / cdc_provisional_crdt_merged_df.crdt_known_race_cases, 4)

# Counties data

df_acs_name_lookup = pd.io.gbq.read_gbq(acs_population_data_query, project_id=project_id)
df_acs_name_lookup.set_index('county_fips', inplace=True)

cdc_provisional_deaths_county_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_county_query, project_id=project_id)

cdc_provisional_deaths_county_df['county_fips'] = cdc_provisional_deaths_county_df.FIPS_Code
cdc_provisional_deaths_county_df.set_index('county_fips', inplace=True)
cdc_provisional_deaths_county_df['state_county'] = cdc_provisional_deaths_county_df.County_Name + ', ' + cdc_provisional_deaths_county_df.State
cdc_provisional_deaths_county_df['total_known_cases'] = round((
    cdc_provisional_deaths_county_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Black.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_county_df.Hispanic.fillna(0)) * cdc_provisional_deaths_county_df.COVID_19_Deaths, 0)

county_chart_provisional_df = cdc_provisional_deaths_county_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_provisional_df.reset_index(inplace=True)
county_chart_provisional_df.county_fips = county_chart_provisional_df.county_fips.astype(int)

nyt_counties_provisional_df = pd.io.gbq.read_gbq(nyt_counties_provisional_query, project_id=project_id)
nyt_counties_provisional_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_provisional_df.county_fips.unique()
nyt_counties_provisional_df['county_fips'] = nyt_counties_provisional_df.county_fips.astype(int)
nyt_counties_provisional_df.set_index('county_fips', inplace=True)

county_chart_provisional_df.set_index('county_fips', inplace=True)
nyt_merged_provisional_df = county_chart_provisional_df.join(nyt_counties_provisional_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
nyt_merged_provisional_df = nyt_merged_provisional_df.reset_index()
nyt_merged_provisional_df['percent'] = round(nyt_merged_provisional_df.COVID_19_Deaths / nyt_merged_provisional_df.nyt_cases, 4)
nyt_merged_provisional_df['percent_known_cases'] = round(nyt_merged_provisional_df.total_known_cases / nyt_merged_provisional_df.COVID_19_Deaths, 4)

#print('total deaths: ', nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print('total counties: ', len(county_chart_provisional_df))
#print('as % of counties: ', len(county_chart_provisional_df) / 3143)
#print('population in counties: ', county_chart_provisional_df.total_pop.sum())
#print('as % of total population: ', county_chart_provisional_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print('known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum())
#print('% known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum() / nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print(0.55 * 324697795) # NYT population
#print('state file deaths: ', cdc_provisional_crdt_merged_df.cdc_cases.sum())
#print('state file known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum())
#print('state file % known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum() / cdc_provisional_crdt_merged_df.cdc_cases.sum())
In [33]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of NCHS to CRDT Deaths as of %s' % provisional_date_display_name

CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()
In [34]:
#@title
cdc_nyt_provisional_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_provisional_title = 'Ratio of NCHS to NYT Deaths as of %s' % provisional_date_display_name

scatter = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, 18000, scatter_height, scatter_width, map_width, 'county', 'ratio'
)

(scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Deaths with Race/Ethnicity

In the charts and maps below, we can see that the NCHS datasets have a strikingly high percentage of deaths with known race/ethnicity. The state dataset had both counts and percentages of deaths from COVID-19 within each race/ethnicity group but not a total deaths count. There is an "Other" category that includes "More than one race or Unknown," and we treated that entire category as Unknown race/ethnicity. The percentages for all the categories including Other didn't always add up to 100%, so we treated the remaining percentage as Unknowns as well (data for a group can be suppressed if that group had fewer than 10 deaths). The county dataset included only percentages of deaths from COVID-19 within each race/ethnicity group, but it did include a total count per county so that we could calculate counts.

In [35]:
#@title

crdt_provisional_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_provisional_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
crdt_provisional_known_map = CreateMap(
    cdc_provisional_crdt_merged_df, crdt_provisional_known_state_fields_dict, crdt_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
cdc_provisional_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_provisional_known_state_title = 'NCHS Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_provisional_known_state_map = CreateMap(
    cdc_provisional_crdt_merged_df, cdc_provisional_known_state_fields_dict, cdc_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_provisional_known_map | cdc_provisional_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [36]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of NCHS to CRDT Deaths with Known Race/Ethnicity'

CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, fields_dict, title, 40000, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [37]:
#@title
cdc_known_county_provisional_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_provisional_title = 'NCHS Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_known_county_provisional_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_provisional_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

What State and County Data are Usable?

How can we evaluate the completeness of race/ethnicity data at the state and county levels so that we know which data are usable?

In the Total Death Counts section above, we identified the states and counties with the biggest discrepancies relative to the total death counts from aggregate data. In the Deaths with Race/Ethnicity section, we looked at the percentage of deaths within each state and county that have race/ethnicity data.

The charts below show those two components together; the scatterplots show (1) the CDC death counts as a percentage of the CRDT/NYT total case counts on the y-axis, and (2) the percentage of CDC deaths with known race/ethnicity on the x-axis. The colors of the dots and on the map show the composite measure of completeness by multiplying those two numbers together, which is the percentage of expected total deaths that have race/ethnicity in the CDC dataset.

The scatterplots below can help us identify which states and counties have usable data and what the completeness issues are:

  • Bottom left quadrant: Low percentage of deaths reported, low availability of race/ethnicity.
  • Top left quadrant: Mid-to-high percentage of deaths reported, low availability of race/ethnicity.
  • Bottom right quadrant: Low percentage of deaths reported, mid-to-high availability of race/ethnicity.
  • Top right quadrant: Mid-to-high percentage of deaths reported, mid-to-high availability of race/ethnicity.
In [38]:
#@title
nyt_cdc_known_merged_df = chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_cdc_known_merged_df.reset_index(inplace=True)
nyt_cdc_known_merged_df['percent'] = round(nyt_cdc_known_merged_df.total_cases / nyt_cdc_known_merged_df.nyt_cases, 2)
In [39]:
#@title
crdt_merged_df['percent_max_100'] = crdt_merged_df.percent.clip(upper=1)
crdt_merged_df['percent_reccs'] = crdt_merged_df.percent_max_100 * crdt_merged_df.cdc_known_cases_percent
state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name

scatter = CreateScatterPlotAndMap(
    crdt_merged_df, state_reccs_fields_dict, state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [40]:
#@title
nyt_cdc_known_merged_df['percent_max_100'] = nyt_cdc_known_merged_df.percent.clip(upper=1)
nyt_cdc_known_merged_df['percent_reccs'] = nyt_cdc_known_merged_df.percent_max_100 * nyt_cdc_known_merged_df.percent_known_cases
county_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_title = state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name


scatter = CreateScatterPlotAndMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

In the NCHS datasets, almost all states and counties are in the top right quadrant, which means that they have a mid-high percentage of total deaths and a mid-high percentage of deaths with race/ethnicity.

In [41]:
#@title
cdc_provisional_crdt_merged_df['percent_max_100'] = cdc_provisional_crdt_merged_df.percent.clip(upper=1)
cdc_provisional_crdt_merged_df['percent_reccs'] = cdc_provisional_crdt_merged_df.percent_max_100 * cdc_provisional_crdt_merged_df.cdc_known_cases_percent
provisional_state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
provisional_state_reccs_title = 'NCHS Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name

scatter = CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, provisional_state_reccs_fields_dict, provisional_state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [42]:
#@title
nyt_merged_provisional_df['percent_max_100'] = nyt_merged_provisional_df.percent.clip(upper=1)
nyt_merged_provisional_df['percent_reccs'] = nyt_merged_provisional_df.percent_max_100 * nyt_merged_provisional_df.percent_known_cases
county_reccs_provisional_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_provisional_title = state_reccs_title = 'NCHS Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name

scatter_provisional = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter_provisional.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

We can get an overall measure of completeness if we look at the number of states in the top right corner of the scatterplot where the composite score is > 85% (where the dots turn dark blue) and > 50% (where the dots turn yellow).

In [43]:
#@title
#PrintSummaryStats(crdt_merged_df, field='percent_reccs')
#PrintSummaryStats(nyt_cdc_known_merged_df, field='percent_reccs')
#greater_than_85_df = nyt_cdc_known_merged_df[nyt_cdc_known_merged_df['percent_reccs'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nyt_cdc_known_merged_df[nyt_cdc_known_merged_df['percent_reccs'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
In [44]:
#@title
#PrintSummaryStats(cdc_provisional_crdt_merged_df, field='percent_reccs')
#PrintSummaryStats(nyt_merged_provisional_df, field='percent_reccs')
#greater_than_85_df = nyt_merged_provisional_df[nyt_merged_provisional_df['percent_reccs'] > .85]
#print('total pop > 85%: %d, %f', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nyt_merged_provisional_df[nyt_merged_provisional_df['percent_reccs'] > .50]
#print('total pop > 50%: %d, %f', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
In [45]:
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
    'Number of states with composite > 85%',
    '(as a percent of all states)',
    'Number of states with composite > 50%',
    '(as a percent of all states)',
    'Number of counties with composite > 85%',
    '(as a percent of all counties)',
    'Number of counties with composite > 50%',
    '(as a percent of all counties)',
    'Population in counties with composite > 85%',
    '(as a % of total U.S population – States + D.C.)',
    'Population in counties with composite > 50%',
    '(as a % of total U.S population – States + D.C.)',
]
crdt_metadata = [
    '45',
    '(88%)',
    '50',
    '(98%)',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
]
cdc_state_provisional_metadata = [
    '44',
    '(86%)',
    '50',
    '(98%)',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
]
cdc_metadata = [
    '8',
    '(17%)',
    '32',
    '(67%)',
    '560',
    '(18%)',
    '1,560',
    '(50%)',
    '77M',
    '(24%)',
    '185M',
    '(56%)',
]
cdc_county_provisional_metadata = [
    '-',
    '-',
    '-',
    '-',
   '519',
    '(16%)',
    '690',
    '(22%)',
    '189M',
    '(58%)',
    '249M',
    '(76%)',
]
table_data = {'CRDT': crdt_metadata,
              'CDC': cdc_metadata,
              'NCHS state': cdc_state_provisional_metadata,
              'NCHS county': cdc_county_provisional_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[45]:
CRDT CDC NCHS state NCHS county
Number of states with composite > 85% 45 8 44 -
(as a percent of all states) (88%) (17%) (86%) -
Number of states with composite > 50% 50 32 50 -
(as a percent of all states) (98%) (67%) (98%) -
Number of counties with composite > 85% - 560 - 519
(as a percent of all counties) - (18%) - (16%)
Number of counties with composite > 50% - 1,560 - 690
(as a percent of all counties) - (50%) - (22%)
Population in counties with composite > 85% - 77M - 189M
(as a % of total U.S population – States + D.C.) - (24%) - (58%)
Population in counties with composite > 50% - 185M - 249M
(as a % of total U.S population – States + D.C.) - (56%) - (76%)

If we require that states or counties have 85% of total expected deaths with race/ethnicity, we can use the NCHS state dataset for 86% of states vs. 17% from the CDC case dataset. We can use the NCHS county dataset for 16% of counties vs. 18% from the CDC case dataset, where the NCHS's counties account for 58% of the U.S. population vs. 24% from the CDC case dataset.

If we loosen that requirement to 50% of total expected deaths with race/ethnicity, we can use the NCHS state dataset for 98% of states vs. 67% from the CDC case dataset. We can use the NCHS county dataset for 22% of counties vs. 50% from the CDC case dataset, where the NCHS's counties account for 76% of the U.S. population vs. 56% from the CDC case dataset.

If you are looking for state- or county-level deaths data with race/ethnicity, the NCHS datasets are more complete than the CDC Case Surveillance dataset in almost every way: 98%-99% of the deaths reported have race/ethnicity, they generally do not lag far behind state website data as reported in the CRDT/NYT, and they are updated once a week rather than once a month.

However, there are a few tradeoffs with using the NCHS state dataset:

  • The data are cumulative, so you will not be able to analyze deaths over time unless you collect the weekly updates from the CDC.
  • Age and sex data are available at the state level only in a separate dataset. All the NCHS datasets are listed at the CDC site.

There are a few additional tradeoffs with using the NCHS county dataset:

  • Only 23% of counties are included in the county dataset even though those counties account for 80% of the U.S. population. Fewer counties in the NCHS dataset are above the 85% and 50% composite measure than in the CDC Case data, but those counties account for a larger percentage of the U.S. population.
  • The NCHS county dataset should not be used at the state or U.S. levels due to the exclusion of counties with fewer than 100 COVID-19 deaths, which are generally counties with small populations. The NCHS state dataset should be used instead. The county dataset could be used for trends across the U.S. in counties with large populations.

How to Improve State and County Data

There are two ways in which states can improve the data they report to the CDC:

  1. Increase the total deaths reported to get closer to the aggregate data.
  2. Increase the percentage of deaths reported with known race/ethnicity to get closer to 100%.

When evaluating the percent of deaths that report on race/ethnicity in the CDC dataset, we also need to consider the 2% of overall deaths with race/ethnicity that were suppressed due to privacy reasons. We should give states and counties credit for reporting race/ethnicity data for those deaths even if we aren't able to use it due to privacy suppression. Below, the map on the top left shows the percent of deaths with known race/ethnicity and the map on the top right shows the percent of deaths with known or suppressed race/ethnicity. The maps below that show the same information at the county level.

In [46]:
#@title
cdc_known_or_na_state_fields_dict = {
    'x': {'name': 'cdc_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_state_title = 'CDC Deaths with Known or Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_state_map = CreateMap(
    crdt_merged_df, cdc_known_or_na_state_fields_dict, cdc_known_or_na_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(cdc_known_state_map | cdc_known_or_na_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [47]:
#@title
cdc_known_county_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

cdc_known_or_na_county_fields_dict = {
    'x': {'name': 'total_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_county_title = 'CDC Deaths with Known or Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_map | cdc_known_or_na_county_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.

We can see that race/ethnicity data is missing in some states such as California, North Dakota, Pennsylvania, and New York.

But the main way to improve the completeness of the CDC case data is for more states and counties to report whether a person died; this information is missing from most or all counties in Texas, New Mexico, South Dakota, Nebraska, Missouri, West Virginia, and Maryland.

Appendix

Large county maps

To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.

In [48]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_nyt_map.display()
In [49]:
#@title
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)

cdc_known_county_map.display()
In [50]:
#@title
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_known_or_na_county_map.display()
In [51]:
#@title
county_completeness = CreateMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)
county_completeness.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [52]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
)

cdc_nyt_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [53]:
#@title
cdc_known_county_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
)

cdc_known_county_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [54]:
#@title
map = CreateMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)

map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Geographic Notes

The NCHS state dataset had separate rows for New York State and New York City; we combined them into New York State.

Data Citations and Disclaimers

  • CDC data full citation: Centers for Disease Control and Prevention, COVID-19 Response. COVID-19 Case Surveillance Data Access, Summary, and Limitations (version date: February 28, 2021).
  • Per the CDC data agreement: The CDC does not take responsibility for the scientific validity or accuracy of methodology, results, statistical analyses, or conclusions presented.
  • NCHS Data: State, County, and dataset index. National Center for Health Statistics (NCHS), National Vital Statistics System. Estimates are based on provisional data.
  • Population data: U.S. Census Bureau's American Community Survey 2019 5-year estimates accessed via API; e.g., sample query.
  • Covid Racial Data Tracker data: Available in a public spreadsheet.
  • New York Times data: Available as a public CSV file or via Google Cloud Public Datasets.

Contact information

Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.

In [55]:
#@title
#%%shell
#jupyter nbconvert --to html 'cdc_death_data.ipynb'